{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-30T08:43:30.129280Z",
     "start_time": "2019-08-30T08:43:29.778546Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import codecs\n",
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 生成训练数据和测试数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-30T08:43:33.176468Z",
     "start_time": "2019-08-30T08:43:32.812270Z"
    }
   },
   "outputs": [],
   "source": [
    "train_df = pd.read_csv('../input/Train_Data.csv')\n",
    "test_df = pd.read_csv('../input/Test_Data.csv')\n",
    "\n",
    "def stop_words(x):\n",
    "    try:\n",
    "        x = x.strip()\n",
    "    except:\n",
    "        return ''\n",
    "    x = re.sub('\\?\\?+','',x)\n",
    "    x = re.sub('\\{IMG:.?.?.?\\}','',x)\n",
    "    return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-30T08:43:36.076708Z",
     "start_time": "2019-08-30T08:43:35.990024Z"
    }
   },
   "outputs": [],
   "source": [
    "train_df['text'] =  train_df['title'].fillna('') + train_df['text'].fillna('')\n",
    "test_df['text'] =  test_df['title'].fillna('') + test_df['text'].fillna('')\n",
    "\n",
    "train_df['text'] = train_df['text'].apply(stop_words)\n",
    "test_df['text'] = test_df['text'].apply(stop_words)\n",
    "\n",
    "train_df = train_df[~train_df['unknownEntities'].isnull()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-30T08:43:51.324141Z",
     "start_time": "2019-08-30T08:43:42.184213Z"
    }
   },
   "outputs": [],
   "source": [
    "with codecs.open('./bert-chinese-ner/data/train.txt', 'w') as up:\n",
    "    for row in train_df.iloc[:-200].itertuples():\n",
    "        # print(row.unknownEntities)\n",
    "\n",
    "        text_lbl = row.text\n",
    "        entitys = str(row.unknownEntities).split(';')\n",
    "        for entity in entitys:\n",
    "            text_lbl = text_lbl.replace(entity, 'Ё' + (len(entity)-1)*'Ж')\n",
    "        \n",
    "        for c1, c2 in zip(row.text, text_lbl):\n",
    "            if c2 == 'Ё':\n",
    "                up.write('{0} {1}\\n'.format(c1, 'B-ORG'))\n",
    "            elif c2 == 'Ж':\n",
    "                up.write('{0} {1}\\n'.format(c1, 'I-ORG'))\n",
    "            else:\n",
    "                up.write('{0} {1}\\n'.format(c1, 'O'))\n",
    "        \n",
    "        up.write('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-30T08:43:51.771978Z",
     "start_time": "2019-08-30T08:43:51.327330Z"
    }
   },
   "outputs": [],
   "source": [
    "with codecs.open('./bert-chinese-ner/data/dev.txt', 'w') as up:\n",
    "    for row in train_df.iloc[-200:].itertuples():\n",
    "        # print(row.unknownEntities)\n",
    "\n",
    "        text_lbl = row.text\n",
    "        entitys = str(row.unknownEntities).split(';')\n",
    "        for entity in entitys:\n",
    "            text_lbl = text_lbl.replace(entity, 'Ё' + (len(entity)-1)*'Ж')\n",
    "        \n",
    "        for c1, c2 in zip(row.text, text_lbl):\n",
    "            if c2 == 'Ё':\n",
    "                up.write('{0} {1}\\n'.format(c1, 'B-ORG'))\n",
    "            elif c2 == 'Ж':\n",
    "                up.write('{0} {1}\\n'.format(c1, 'I-ORG'))\n",
    "            else:\n",
    "                up.write('{0} {1}\\n'.format(c1, 'O'))\n",
    "        \n",
    "        up.write('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-30T08:44:04.267694Z",
     "start_time": "2019-08-30T08:43:53.600622Z"
    }
   },
   "outputs": [],
   "source": [
    "with codecs.open('./bert-chinese-ner/data/test.txt', 'w') as up:\n",
    "    for row in test_df.iloc[:].itertuples():\n",
    "\n",
    "        text_lbl = row.text\n",
    "        for c1 in text_lbl:\n",
    "            up.write('{0} {1}\\n'.format(c1, 'O'))\n",
    "        \n",
    "        up.write('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with codecs.open('./bert-chinese-ner/data/dev.txt', 'w') as up:\n",
    "    for row in train_df.iloc[-500:].itertuples():\n",
    "        # print(row.unknownEntities)\n",
    "\n",
    "        text_lbl = row.text\n",
    "        entitys = str(row.unknownEntities).split(';')\n",
    "        for entity in entitys:\n",
    "            text_lbl = text_lbl.replace(entity, 'Ё' + (len(entity)-1)*'Ж')\n",
    "        \n",
    "        for c1, c2 in zip(row.text, text_lbl):\n",
    "            if c2 == 'Ё':\n",
    "                up.write('{0} {1}\\n'.format(c1, 'B-ORG'))\n",
    "            elif c2 == 'Ж':\n",
    "                up.write('{0} {1}\\n'.format(c1, 'I-ORG'))\n",
    "            else:\n",
    "                up.write('{0} {1}\\n'.format(c1, 'O'))\n",
    "        \n",
    "        up.write('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with codecs.open('./bert-chinese-ner/data/test.txt', 'w') as up:\n",
    "    for row in test_df.iloc[:].itertuples():\n",
    "\n",
    "        text_lbl = row.text\n",
    "        for c1 in text_lbl:\n",
    "            up.write('{0} {1}\\n'.format(c1, 'O'))\n",
    "        \n",
    "        up.write('\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 生成提交文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-30T10:21:13.335886Z",
     "start_time": "2019-08-30T10:21:10.932918Z"
    }
   },
   "outputs": [],
   "source": [
    "test_pred = codecs.open('./bert-chinese-ner/output/result_dir/label_test.txt').readlines()\n",
    "\n",
    "pred_tag = []\n",
    "pred_word = []\n",
    "\n",
    "pred_line_tag = ''\n",
    "pred_line_word = ''\n",
    "\n",
    "for line in test_pred:\n",
    "    line = line.strip()\n",
    "    \n",
    "    if len(line) == 0 or line == '':\n",
    "        pred_tag.append(pred_line_tag)\n",
    "        pred_word.append(pred_line_word)\n",
    "        pred_line_tag = ''\n",
    "        pred_line_word = ''\n",
    "        continue\n",
    "    \n",
    "    c, _, tag = line.split(' ')\n",
    "    \n",
    "    if tag != 'O':\n",
    "        tag = tag[1:]\n",
    "        pred_line_word += c\n",
    "    else:\n",
    "        pred_line_word += ';'\n",
    "        \n",
    "    pred_line_tag += tag    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-30T10:21:16.779637Z",
     "start_time": "2019-08-30T10:21:16.564624Z"
    }
   },
   "outputs": [],
   "source": [
    "def filter_word(w):\n",
    "    for wbad in ['？','《','🔺','️?','!','#','%','%','，','Ⅲ','》','丨','、','）','（','​',\n",
    "                '👍','。','😎','/','】','-','⚠️','：','✅','㊙️','“',')','(','！','🔥',',']:\n",
    "        if wbad in w:\n",
    "            return ''\n",
    "    return w\n",
    "\n",
    "with codecs.open('baseline2.csv', 'w') as up:\n",
    "    up.write('id,unknownEntities\\n')\n",
    "    \n",
    "    for word, id in zip(pred_word, test_df['id'].values):\n",
    "        word = set([filter_word(x) for x in word.split(';') if x not in ['', ';'] and len(x) > 1])\n",
    "        word = [x for x in word if x != '']\n",
    "        \n",
    "        if len(word) == 0:\n",
    "            word = ['我们']\n",
    "        \n",
    "        word = ';'.join(list(word))\n",
    "        up.write('{0},{1}\\n'.format(id, word))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
